import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

# Load the movie dataset; the path is relative to the current working directory.
df = pd.read_csv("csvFile/movieData.csv")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None" line.
df.info()
# print(df.head(1))

# Average movie rating
# print((df["Rating"]).mean())

# Number of distinct directors
# print(len(set(df["Director"].tolist())))
# print(len(df["Director"].unique()))

# Number of distinct actors
# tempActors = df["Actors"].str.split(", ").tolist()
# print(tempActors)
# actors = []
# for tempActor in tempActors:
#     for actor in tempActor:
#         actors.append(actor)
#
# print(len(set(actors)))

# Distribution of movie runtimes
# print(df["Runtime (Minutes)"])
# runtimeList = df["Runtime (Minutes)"].tolist()
#
# plt.figure(figsize=(80,8),dpi=100)
# plt.grid()
# plt.xticks(range(min(runtimeList),max(runtimeList)+5,5))
#
# plt.hist(runtimeList,range(min(runtimeList),max(runtimeList)+5,5))
# plt.show()

# Count how many movies carry each genre label.
#
# Each "Genre" cell is split on "," into a list of labels for that movie.
# A missing cell does not split into a list (it stays a scalar NA value), so
# such rows are treated as having no labels instead of crashing the loops below.
movieLabels = [
    list(labels) if pd.api.types.is_list_like(labels) else []
    for labels in df["Genre"].str.split(",")
]
# print(movieLabels)

# Flatten the per-movie lists into one list holding every label occurrence.
allLabels = [
    tempLabel
    for tempLabels in movieLabels
    for tempLabel in tempLabels
]
label = set(allLabels)
# print(label)

# One row per movie, one column per distinct label, initialised to 0.
# The columns are passed as a sorted list rather than the raw set: a set has no
# guaranteed iteration order, so the column (and printed) order would otherwise
# vary between runs, and newer pandas releases refuse a set for `columns`.
df2 = pd.DataFrame(
    np.zeros((df.shape[0], len(label))),
    columns=sorted(label),
)
# print(df2)

# Mark with 1 every label that belongs to movie i (row labels of df2 are 0..n-1).
for i, labels in enumerate(movieLabels):
    # print(i, labels)
    if labels:
        df2.loc[i, labels] = 1

# Column-wise sum = number of movies tagged with each label.
print(df2.sum(axis=0))


